Library¶

In [ ]:
import pandas as pd
import numpy as np
import PreProcessingText as ppt
from collections import Counter, defaultdict
import seaborn as sns
from wordcloud import WordCloud
import networkx as nx
import matplotlib.pyplot as plt
import squarify
from transformers import pipeline
from tqdm import tqdm
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer, util
from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer
from keybert import KeyBERT
from umap import UMAP
import hdbscan
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import csv
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.cluster import KMeans
from scipy.spatial import distance
from scipy.cluster import hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
from matplotlib.colors import ListedColormap

4° Approach: BERTopic¶

Baseline Summary¶

Clustering Approach¶

  • Parameter Setting: A high parameter was set for HDBSCAN to ensure well-defined clusters that occupy a significant percentage of the total dataset. This baseline is intended to feed machine learning algorithms for prediction purposes.

Initial Clustering Results¶

  • Clusters Retrieved: 7 representative clusters were identified:

    1. Drug sales
    2. Bitcoin
    3. Scammers and seller reviews
    4. Marketplace advertising
    5. Purchase reviews
    6. Drug purchases
    7. Orders
  • Outliers: Initially, 34k outliers were found out of a total of 66k records.

  • Performance Metrics:

    • Silhouette Score: 0.64
    • Davies-Bouldin Score: 0.6

Outlier Reduction¶

  • Cosine Measure on Embeddings: By applying a cosine similarity measure with a 0.53 threshold, the number of outliers was reduced from 34k to 27k, reintroducing about 7k records.

  • Updated Performance Metrics:

    • Silhouette Score: 0.51
    • Davies-Bouldin Score: 0.8

Trade-off Analysis¶

  • Outlier Reintroduction: Reintroducing the outliers found a balance that prevented significant cluster degradation while keeping clusters well separated and defined, as evidenced by the graphs.

  • Cluster Distribution: The updated clusters are well-distributed:

    • Maximum cluster size: 23% of the total dataset
    • Minimum cluster size: 7% of the total dataset
    • This distribution avoids large excursions.

Data Loss and Potential Adjustments¶

  • Data Loss: Approximately 40% of the initial dataset was lost.

  • Potential Correction: This data loss can potentially be mitigated by lowering the cosine similarity threshold between embeddings.

¶

In [2]:
df = pd.read_csv('cleaned_data_name_thread.csv')
df = df.dropna(subset=['name_thread'])
df = df.drop_duplicates(subset=['name_thread'], keep='first')
df.shape[0]
Out[2]:
66735
In [ ]:
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
tc1 = ppt.TextClustering(df, 'name_thread')
tc1.encode_corpus(model, batch_size=64, to_tensor=False)
In [ ]:
len(tc1.corpus), len(tc1.corpus_embeddings)
In [ ]:
seed_topic_list = [[
    'tor site', 'drug', 'cocaine', 'ketamine', 'weed', 'trafficking', 'scammer', 'market', 'vendor', 'bitcoin',
    'mdma', 'coke', 'lsd', 'heroine', 'xanax', 'tor node', 'tor site', 'gun', 'weapon', 'hacking'
]]

zeroshot_topic_list = [pd.read_csv('../../../intent_crime.csv')['intent'].tolist()]

representation_model = MaximalMarginalRelevance(diversity=0.3)
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=1200, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

topic_model = BERTopic(
    language='multilingual',
    top_n_words=10, 
    n_gram_range=(1, 2),
    umap_model=umap_model, 
    hdbscan_model=hdbscan_model, 
    seed_topic_list=seed_topic_list, 
    vectorizer_model=vectorizer_model, 
    ctfidf_model=ctfidf_model, 
    representation_model=representation_model,
    zeroshot_topic_list=zeroshot_topic_list, 
    zeroshot_min_similarity=.05, 
    verbose=True
)

topics, probs = topic_model.fit_transform(tc1.corpus, tc1.corpus_embeddings)
In [21]:
print(topic_model.get_topic_info())
for topic_id in set(topics):
    print(f"Topic {topic_id}:")
    print(topic_model.get_topic(topic_id))
   Topic  Count                               Name  \
0     -1  34449            -1_new_free_ticket_help   
1      0   7495          0_weed_xanax_cocaine_coke   
2      1   6093    1_market_dream_empire_nightmare   
3      2   5034     2_vendor_scammer_scam_scamming   
4      3   4087      3_review_vendor_feedback_mdma   
5      4   4003              4_mdma_lsd_shit_whats   
6      5   2402  5_order_package_delivery_shipping   
7      6   1966       6_bitcoin_card_wallet_credit   

                                      Representation  \
0  [new, free, ticket, help, update, account, mdm...   
1  [weed, xanax, cocaine, coke, ketamine, mg, can...   
2  [market, dream, empire, nightmare, vendor, wal...   
3  [vendor, scammer, scam, scamming, exit, scamme...   
4  [review, vendor, feedback, mdma, mg, sample, r...   
5  [mdma, lsd, shit, whats, fuck, gone, got, guy,...   
6  [order, package, delivery, shipping, tracking,...   
7  [bitcoin, card, wallet, credit, coin, carding,...   

                                 Representative_Docs  
0  [canadianflavor weed shatter cbd edible hash c...  
1  [high quality weed thc product europe, new xan...  
2  [next market, dream market vendor rstclass nig...  
3  [looking good reliable vendor sell ounce, vend...  
4  [empire vendor cocaine review, first ever revi...  
5  [hey ro im gon na pull pk, life wonderful life...  
6  [order accepted day still hasnt marked shipped...  
7  [credit cards paypal prepaid card find, got cc...  
Topic 0:
[('weed', 0.5972313505812425), ('xanax', 0.5664832282989213), ('cocaine', 0.5350787342936356), ('coke', 0.4710111701375004), ('ketamine', 0.46985128023380035), ('mg', 0.46256209204548415), ('cannabis', 0.41853925594172725), ('drug', 0.4053330171594432), ('pill', 0.3907822559981816), ('quality', 0.38621568363790615)]
Topic 1:
[('market', 0.892430998800942), ('dream', 0.6865843677324943), ('empire', 0.6830028029033173), ('nightmare', 0.5681939396872522), ('vendor', 0.34305231363817884), ('wall', 0.3245499595042113), ('marketplace', 0.319921898437173), ('scam', 0.2961241301762431), ('exit', 0.2960733863924834), ('link', 0.2915460778160393)]
Topic 2:
[('vendor', 0.6950361459297074), ('scammer', 0.6725026815231682), ('scam', 0.4980623980369779), ('scamming', 0.46575246018365657), ('exit', 0.44160475610894967), ('scammed', 0.40051759892624533), ('looking', 0.37884048200047027), ('warning', 0.37715463753082534), ('reliable', 0.37144259341974245), ('buyer', 0.3708904841304073)]
Topic 3:
[('review', 1.002255217202406), ('vendor', 0.5076272530565451), ('feedback', 0.4049037794348937), ('mdma', 0.381329954044546), ('mg', 0.37619091451980585), ('sample', 0.3754397070467268), ('reviews', 0.3504300951320543), ('lsd', 0.3465899767001684), ('opinion', 0.3303160657068881), ('xanax', 0.33022254366369147)]
Topic 4:
[('mdma', 0.38275973612659386), ('lsd', 0.3779572278615291), ('shit', 0.35340590919386444), ('whats', 0.34834774258692336), ('fuck', 0.3264035078860319), ('gone', 0.31797094824590016), ('got', 0.3167851762249627), ('guy', 0.3153758862961693), ('dead', 0.31361936874635366), ('going', 0.3042237209259171)]
Topic 5:
[('order', 0.9350712100343167), ('package', 0.6655706541276237), ('delivery', 0.562721266995139), ('shipping', 0.527231820138037), ('tracking', 0.5122872117651205), ('shipped', 0.48839280205239965), ('ordering', 0.4784769909883374), ('cancelled', 0.47119974969542505), ('pack', 0.4566507281813944), ('delivered', 0.45351148583756845)]
Topic 6:
[('bitcoin', 0.8235475804294793), ('card', 0.7734286502423073), ('wallet', 0.6772588642347616), ('credit', 0.6731588060336892), ('coin', 0.5703668040987371), ('carding', 0.5529443276986676), ('btc', 0.5121844608207589), ('cash', 0.5037356917020909), ('debit', 0.500260454896595), ('coinbase', 0.49454000630077194)]
Topic -1:
[('new', 0.28398750337326484), ('free', 0.2771677713524054), ('ticket', 0.2699448449851029), ('help', 0.2697705189262906), ('update', 0.2675394807401724), ('account', 0.26547262677161937), ('mdma', 0.2638718211547908), ('vendor', 0.2588459510247759), ('dispute', 0.25440435619535773), ('need', 0.2488688355528112)]
In [22]:
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.6388389468193054
Davies_bouldin_score: 0.5523262827209047
In [24]:
best_indices = np.argsort(silhouette_scores)[-10:]
best_umap_embeddings = umap_embeddings[best_indices]
plt.figure(figsize=(10, 5))
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=5)
plt.gca().set_aspect('equal', 'datalim')
plt.colorbar()
plt.title('UMAP projection of the topics with highest silhouette scores', fontsize=24)
plt.show()
No description has been provided for this image
In [25]:
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 5))
topic_model.update_topics(tc1.corpus, vectorizer_model=vectorizer_model)
In [26]:
topic_model.get_topic_freq()
Out[26]:
Topic Count
0 -1 34449
5 0 7495
2 1 6093
1 2 5034
7 3 4087
6 4 4003
3 5 2402
4 6 1966
In [27]:
print(topic_model.get_topic_info())
for topic_id in set(topics):
    print(f"Topic {topic_id}:")
    print(topic_model.get_topic(topic_id))
   Topic  Count                                         Name  \
0     -1  34449                      -1_vendor_new_free_help   
1      0   7495                       0_weed_vendor_xanax_mg   
2      1   6093                 1_market_empire_dream_vendor   
3      2   5034                2_vendor_scammer_scam_looking   
4      3   4087  3_review_vendor review_vendor_review vendor   
5      4   4003                          4_mdma_lsd_good_got   
6      5   2402            5_order_package_shipping_delivery   
7      6   1966                 6_card_bitcoin_wallet_credit   

                                      Representation  \
0  [vendor, new, free, help, best, account, uk, u...   
1  [weed, vendor, xanax, mg, cocaine, uk, best, c...   
2  [market, empire, dream, vendor, nightmare, dre...   
3  [vendor, scammer, scam, looking, scamming, exi...   
4  [review, vendor review, vendor, review vendor,...   
5  [mdma, lsd, good, got, shit, whats, guy, fuck,...   
6  [order, package, shipping, delivery, vendor, p...   
7  [card, bitcoin, wallet, credit, btc, carding, ...   

                                 Representative_Docs  
0  [canadianflavor weed shatter cbd edible hash c...  
1  [high quality weed thc product europe, new xan...  
2  [next market, dream market vendor rstclass nig...  
3  [looking good reliable vendor sell ounce, vend...  
4  [empire vendor cocaine review, first ever revi...  
5  [hey ro im gon na pull pk, life wonderful life...  
6  [order accepted day still hasnt marked shipped...  
7  [credit cards paypal prepaid card find, got cc...  
Topic 0:
[('weed', 0.02425497350614531), ('vendor', 0.021978341010015688), ('xanax', 0.02077949072716719), ('mg', 0.01948517638840499), ('cocaine', 0.018417804414484252), ('uk', 0.015046793957699879), ('best', 0.013425752943917355), ('coke', 0.012717130457267087), ('ketamine', 0.01175969464362258), ('cannabis', 0.010948216683877144)]
Topic 1:
[('market', 0.09008978566905657), ('empire', 0.055274112551010335), ('dream', 0.04917325935832957), ('vendor', 0.024276714575283735), ('nightmare', 0.023605168431774765), ('dream market', 0.016025449931173885), ('empire market', 0.014646720705699409), ('new', 0.009033909010090109), ('nightmare market', 0.008867402221856543), ('scam', 0.006303868464254871)]
Topic 2:
[('vendor', 0.09965429794348642), ('scammer', 0.025788920958809015), ('scam', 0.017833603310448354), ('looking', 0.01337570071081538), ('scamming', 0.012208815488636926), ('exit', 0.011806364340026236), ('scammed', 0.008689720115543394), ('uk', 0.008678133768927804), ('good', 0.008493482524539575), ('warning', 0.008418582129949287)]
Topic 3:
[('review', 0.1428141634073404), ('vendor review', 0.058876246025626515), ('vendor', 0.05315846344525214), ('review vendor', 0.021049951157661017), ('review vendor review', 0.017406474951027713), ('review review', 0.015138695407876355), ('mg', 0.012888546716744416), ('mdma', 0.011146461993445255), ('sample', 0.010133356066428198), ('dream', 0.009783289767907996)]
Topic 4:
[('mdma', 0.011231558108969678), ('lsd', 0.009238251834183116), ('good', 0.007359917621616781), ('got', 0.006638868206622288), ('shit', 0.0065802885463340675), ('whats', 0.006051630264178851), ('guy', 0.005697866126116449), ('fuck', 0.005394916465354471), ('going', 0.005375411718474036), ('wsm', 0.0052967375805114646)]
Topic 5:
[('order', 0.09533424569336707), ('package', 0.025076372096897597), ('shipping', 0.02284913659637588), ('delivery', 0.018139605364174704), ('vendor', 0.014195026757439324), ('pack', 0.014024930561711633), ('tracking', 0.012976075064416448), ('shipped', 0.012741042718045418), ('ordering', 0.01153929794529684), ('time', 0.01087192180365464)]
Topic 6:
[('card', 0.04045581193563761), ('bitcoin', 0.03526436871145481), ('wallet', 0.02671909128748556), ('credit', 0.02286661027552805), ('btc', 0.0196385675748142), ('carding', 0.018970779081355412), ('coin', 0.016677548495845462), ('credit card', 0.014601870612078016), ('cash', 0.012420616388040553), ('bank', 0.010979756425111214)]
Topic -1:
[('vendor', 0.013820616851140987), ('new', 0.009152016420677532), ('free', 0.006913858221511509), ('help', 0.006453408973195096), ('best', 0.0060032500179123234), ('account', 0.005801364375676093), ('uk', 0.005664162822486113), ('update', 0.005547486073465391), ('crosspost', 0.005503646525948444), ('need', 0.00541678801673178)]
In [28]:
topic_model.visualize_topics()

7DistanceTimeSeries_0.641200-2.png

In [29]:
topic_model.visualize_heatmap()

7MatrixDistribution_0.64sil1200-2.png

In [30]:
topic_model.visualize_hierarchy()

image.png

In [31]:
reduced_embeddings = UMAP(n_neighbors=15, n_components=2, 
                          min_dist=0.0, metric='cosine').fit_transform(tc1.corpus_embeddings)
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings, 
                                hide_document_hover=True, hide_annotations=True)

7ClusterDistribution_0.64sil1200.png

In [32]:
topic_model.visualize_barchart()

image.png

In [ ]:
new_topics = topic_model.reduce_outliers(tc1.corpus, topics, strategy="embeddings", embeddings=tc1.corpus_embeddings, threshold=0.53)
topic_model.update_topics(tc1.corpus, topics=new_topics)
In [75]:
topic_model.get_topic_info()
Out[75]:
Topic Count Name Representation Representative_Docs
0 -1 27613 -1_anyone_new_help_free [anyone, new, help, free, please, update, tick... [canadianflavor weed shatter cbd edible hash c...
1 0 8645 0_weed_xanax_vendor_cocaine [weed, xanax, vendor, cocaine, mg, uk, coke, b... [high quality weed thc product europe, new xan...
2 1 6236 1_market_empire_dream_nightmare [market, empire, dream, nightmare, vendor, dre... [next market, dream market vendor rstclass nig...
3 2 6907 2_vendor_scammer_scam_looking [vendor, scammer, scam, looking, scamming, sal... [looking good reliable vendor sell ounce, vend...
4 3 4230 3_review_vendor review_vendor_review vendor [review, vendor review, vendor, review vendor,... [empire vendor cocaine review, first ever revi...
5 4 6299 4_mdma_lsd_get_looking [mdma, lsd, get, looking, wsm, good, btc, ques... [hey ro im gon na pull pk, life wonderful life...
6 5 2776 5_order_package_shipping_delivery [order, package, shipping, delivery, pack, shi... [order accepted day still hasnt marked shipped...
7 6 2823 6_bitcoin_card_wallet_btc [bitcoin, card, wallet, btc, bank, credit, car... [credit cards paypal prepaid card find, got cc...
In [76]:
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings, 
                                hide_document_hover=True, hide_annotations=True)

image-2.png

In [77]:
topic_model.visualize_hierarchy()

image-2.png

In [78]:
topic_model.visualize_topics()

image-2.png

In [88]:
topic_model.visualize_barchart()

7BarChartDistribution_0.64sil1200_after.png

In [79]:
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(new_topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.5083789229393005
Davies_bouldin_score: 0.7570962651091117
In [ ]:
df['name_thread'] = df['name_thread'].str.lower().dropna()
df.drop_duplicates(subset='name_thread', inplace=True)
df.dropna(subset=['name_thread'], inplace=True)
created_on = df['created_on'].tolist()
len(created_on)
In [ ]:
topics_over_time = topic_model.topics_over_time(tc1.corpus, created_on, 
                                                global_tuning=True, evolution_tuning=True, nr_bins=100)
In [82]:
topic_model.visualize_topics_over_time(topics_over_time, width=1250, height=700)

7ClusterTimeSeries_0.641200-3.png

In [83]:
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
corpus_valid = [tc1.corpus[i] for i in indices]
created_on_valid = [created_on[i] for i in indices]
embeddings_valid = [tc1.corpus_embeddings[i] for i in indices]
topics_valid = [new_topics[i] for i in indices]
probs_valid = [probs[i] for i in indices]

results = pd.DataFrame({
    'Document': corpus_valid,
    'Embedding': embeddings_valid,
    'Topic': topics_valid,
    'Probability': probs_valid,
    'Created_on': created_on_valid,
})
results_final = pd.merge(results, topic_model.get_topic_info(), on='Topic')

results_final['UMAP_embedding'] = list(X)
print(results_final.shape)
results_final.head()
(37916, 10)
Out[83]:
Document Embedding Topic Probability Created_on Count Name Representation Representative_Docs UMAP_embedding
0 checks [0.052164897, 0.029597273, -0.03666609, 0.0051... 4 0.000000 2020-01-09 6299 4_mdma_lsd_get_looking [mdma, lsd, get, looking, wsm, good, btc, ques... [hey ro im gon na pull pk, life wonderful life... [1.6488198, 9.914265, 1.442794, 2.8094368, -0....
1 trusted vendor status [0.02445144, -0.008732641, -0.0050215074, 0.01... 2 0.944247 2020-01-09 6907 2_vendor_scammer_scam_looking [vendor, scammer, scam, looking, scamming, sal... [looking good reliable vendor sell ounce, vend... [2.910516, 10.281041, 1.650234, 3.0320778, -0....
2 empire exit scam iiflux user incomming [0.02890829, 0.036081452, -0.027694924, -0.007... 1 1.000000 2019-11-06 6236 1_market_empire_dream_nightmare [market, empire, dream, nightmare, vendor, dre... [next market, dream market vendor rstclass nig... [1.5884036, 9.8587885, 3.3090453, 2.652358, 2....
3 ecstasy vendor packs [-0.022524439, 0.03949761, -0.023750877, 0.033... 5 0.797741 2020-01-09 2776 5_order_package_shipping_delivery [order, package, shipping, delivery, pack, shi... [order accepted day still hasnt marked shipped... [2.0245404, 10.517631, 2.3443217, 3.7595236, -...
4 opening bank account person fake id [-0.029834118, 0.03354508, -0.012210185, -0.02... 6 1.000000 2019-11-06 2823 6_bitcoin_card_wallet_btc [bitcoin, card, wallet, btc, bank, credit, car... [credit cards paypal prepaid card find, got cc... [0.7278271, 9.884823, 1.8116106, 2.9336705, -0...
In [ ]:
topic_model.save("Models/topic_model_0.64SilNew", serialization='pickle')
In [85]:
results_final.to_parquet('ResultsBERTopic/BERTopic_nodefinedcluster_topics_15n_10com_1200cluster_0.64sil_renewout.parquet')
In [86]:
sns.histplot(results_final, x='Topic', discrete=True);
No description has been provided for this image
In [87]:
plt.pie(results_final.value_counts('Topic'), labels=results_final.value_counts('Topic').index, autopct='%1.1f%%');
No description has been provided for this image

500 min cluster size¶

In [ ]:
seed_topic_list = [[
    'tor site', 'drug', 'cocaine', 'ketamine', 'weed', 'trafficking', 'scammer', 'market', 'vendor', 'bitcoin',
    'mdma', 'coke', 'lsd', 'heroine', 'xanax', 'tor node', 'tor site', 'gun', 'weapon', 'hacking'
]]

zeroshot_topic_list = [pd.read_csv('../../../intent_crime.csv')['intent'].tolist()]

representation_model = MaximalMarginalRelevance(diversity=0.3)
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=500, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

topic_model = BERTopic(
    language='multilingual',
    top_n_words=10, 
    n_gram_range=(1, 2),
    umap_model=umap_model, 
    hdbscan_model=hdbscan_model, 
    seed_topic_list=seed_topic_list, 
    vectorizer_model=vectorizer_model, 
    ctfidf_model=ctfidf_model, 
    representation_model=representation_model,
    zeroshot_topic_list=zeroshot_topic_list, 
    zeroshot_min_similarity=.05, 
    verbose=True
)

topics, probs = topic_model.fit_transform(tc1.corpus, tc1.corpus_embeddings)
In [59]:
topic_model.get_topic_info()
Out[59]:
Topic Count Name Representation Representative_Docs
0 -1 28000 -1_mdma_new_link_lsd [mdma, new, link, lsd, help, free, vendor, nee... [need high quality fake id check , big thanks ...
1 0 4930 0_xanax_coke_cocaine_ketamine [xanax, coke, cocaine, ketamine, mg, drug, pil... [promo sale mg adderall ad xanax mg lsd mdma u...
2 1 4469 1_bitcoin_card_bank_carding [bitcoin, card, bank, carding, monero, wallet,... [way cash bank log using btc, send bitcoin get...
3 2 4227 2_dread_sub_lsd_shit [dread, sub, lsd, shit, mdma, whats, guy, fuck... [hey guy xangod man, let guy know dread host w...
4 3 3702 3_market_dream_nightmare_dreammarket [market, dream, nightmare, dreammarket, market... [not order nightmare market, nightmare market ...
5 4 3469 4_review_vendor_reviews_mg [review, vendor, reviews, mg, vendymcvendface,... [thclear ml purple kush vape cart review, vend...
6 5 3410 5_order_package_pack_dispute [order, package, pack, dispute, delivery, ship... [package custom month love letter nothing, pac...
7 6 2700 6_vendor_looking_seller_vendors [vendor, looking, seller, vendors, buyer, lsd,... [best vendor uk lsd, looking good vendor cc fu...
8 7 1694 7_weed_cannabis_marijuana_hash [weed, cannabis, marijuana, hash, quality, str... [hash weed ship eu good vendor also usa, new i...
9 8 1540 8_darknet_dark_web_sentenced [darknet, dark, web, sentenced, drug, darkweb,... [tacoma man sentenced four year dealing drugs ...
10 9 1502 9_empire_dispute_deposit_empiremarket [empire, dispute, deposit, empiremarket, scamm... [empire next, give me empire, empire anyone else]
11 10 1475 10_account_password_pgp_hacking [account, password, pgp, hacking, hacked, secu... [vendor enerygcontrolled hacked ca nt log pass...
12 11 1314 11_tried_anybody_heard_ordered [tried, anybody, heard, ordered, used, recentl... [anybody heard pasitheas, anyone order recentl...
13 12 1031 12_scammer_scam_exit_scamming [scammer, scam, exit, scamming, warning, scamm... [xangod scammer going exit scam proof, cottage...
14 13 777 13_update_maintenance_updated_upgrade [update, maintenance, updated, upgrade, vender... [shipping update, update order, vendor update]
15 14 681 14_ticket_support_deposit_month [ticket, support, deposit, month, response, an... [support ticket ticket, please help support ti...
16 15 608 15_sample_samples_free_test [sample, samples, free, test, testing, lab, te... [xanax mg shipping free samples, new vendor fr...
In [60]:
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.5718363523483276
Davies_bouldin_score: 0.6211900149809264
In [61]:
best_indices = np.argsort(silhouette_scores)[-10:]
best_umap_embeddings = umap_embeddings[best_indices]
unique_labels = np.unique(labels)
cmap = plt.cm.magma
plt.figure(figsize=(10, 5))
scatter = plt.scatter(X[:, 1], X[:, 2], c=labels, cmap=cmap, s=5)
plt.gca().set_aspect('equal', 'datalim')
norm = plt.Normalize(vmin=min(labels), vmax=max(labels))
handles = [plt.Line2D([0], [0], marker='o', color=cmap(norm(label)), linestyle='', markersize=10) for label in unique_labels]
legend_labels = [f'Class {label}' for label in unique_labels]
plt.legend(handles, legend_labels, title="Classes")
plt.colorbar(scatter, ticks=range(len(unique_labels)))
plt.title('UMAP projection of the topics with highest silhouette scores', fontsize=24)
plt.show()
No description has been provided for this image
In [62]:
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 5))
topic_model.update_topics(tc1.corpus, vectorizer_model=vectorizer_model)
In [63]:
topic_model.visualize_topics()

16DistanceDistribution_0.5sil500.png

In [64]:
topic_model.visualize_heatmap()

16MatrixDistribution_0.5sil500.png

In [65]:
topic_model.visualize_hierarchy()

16HirachicalDistribution_0.5sil500.png

In [66]:
reduced_embeddings = UMAP(n_neighbors=15, n_components=2, 
                          min_dist=0.0, metric='cosine').fit_transform(tc1.corpus_embeddings)
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings, 
                                hide_document_hover=True, hide_annotations=True)

16ClusterDistribution_0.5sil500.png

In [ ]:
new_topics = topic_model.reduce_outliers(tc1.corpus, topics, strategy="embeddings", embeddings=tc1.corpus_embeddings, threshold=0.6)
topic_model.update_topics(tc1.corpus, topics=new_topics)
In [96]:
topic_model.get_topic_info()
Out[96]:
Topic Count Name Representation Representative_Docs
0 -1 23928 -1_new_vendor_help_uk [new, vendor, help, uk, need, mdma, best, free... [need high quality fake id check , big thanks ...
1 0 5207 0_xanax_cocaine_mg_coke [xanax, cocaine, mg, coke, ketamine, vendor, p... [promo sale mg adderall ad xanax mg lsd mdma u...
2 1 4512 1_bitcoin_card_bank_carding [bitcoin, card, bank, carding, monero, wallet,... [way cash bank log using btc, send bitcoin get...
3 2 4944 2_dread_mdma_lsd_get [dread, mdma, lsd, get, sub, shit, guy, lookin... [hey guy xangod man, let guy know dread host w...
4 3 3801 3_market_dream_nightmare_dream market [market, dream, nightmare, dream market, vendo... [not order nightmare market, nightmare market ...
5 4 3706 4_review_vendor review_vendor_review vendor [review, vendor review, vendor, review vendor,... [thclear ml purple kush vape cart review, vend...
6 5 3434 5_order_dispute_pack_package [order, dispute, pack, package, shipping, deli... [package custom month love letter nothing, pac...
7 6 4123 6_vendor_vendor vendor_looking_best [vendor, vendor vendor, looking, best, inquiry... [best vendor uk lsd, looking good vendor cc fu...
8 7 1848 7_weed_cannabis_uk_weed vendor [weed, cannabis, uk, weed vendor, vendor, qual... [hash weed ship eu good vendor also usa, new i...
9 8 1557 8_darknet_dark_dark web_web [darknet, dark, dark web, web, drug, sentenced... [tacoma man sentenced four year dealing drugs ...
10 9 1835 9_empire_empire market_market_empire empire [empire, empire market, market, empire empire,... [empire next, give me empire, empire anyone else]
11 10 1542 10_account_pgp_password_vendor account [account, pgp, password, vendor account, crypt... [vendor enerygcontrolled hacked ca nt log pass...
12 11 1394 11_anyone_has_has anyone_anybody [anyone, has, has anyone, anybody, tried, anyo... [anybody heard pasitheas, anyone order recentl...
13 12 1398 12_scammer_scam_exit_scamming [scammer, scam, exit, scamming, scammed, warni... [xangod scammer going exit scam proof, cottage...
14 13 826 13_update_maintenance_updated_update update [update, maintenance, updated, update update, ... [shipping update, update order, vendor update]
15 14 682 14_ticket_support ticket_support_please [ticket, support ticket, support, please, depo... [support ticket ticket, please help support ti...
16 15 792 15_sample_free_free sample_samples [sample, free, free sample, samples, free samp... [xanax mg shipping free samples, new vendor fr...
In [97]:
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings, 
                                hide_document_hover=True, hide_annotations=True)

16ClusterDistribution_0.5sil500_after.png

In [98]:
topic_model.visualize_topics()

16DistanceDistribution_0.5sil500_after.png

In [106]:
topic_model.visualize_hierarchy()

16HirachicalDistribution_0.5sil500_after.png

In [15]:
topic_model.visualize_barchart(top_n_topics=16)

16BarChartDistribution_0.5sil500_after-2.png

In [99]:
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(new_topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.49986162781715393
Davies_bouldin_score: 0.7193546666619981
In [100]:
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
corpus_valid = [tc1.corpus[i] for i in indices]
created_on_valid = [created_on[i] for i in indices]
embeddings_valid = [tc1.corpus_embeddings[i] for i in indices]
topics_valid = [new_topics[i] for i in indices]
probs_valid = [probs[i] for i in indices]

results = pd.DataFrame({
    'Document': corpus_valid,
    'Embedding': embeddings_valid,
    'Topic': topics_valid,
    'Probability': probs_valid,
    'Created_on': created_on_valid,
})
results_final = pd.merge(results, topic_model.get_topic_info(), on='Topic')

results_final['UMAP_embedding'] = list(X)
print(results_final.shape)
results_final.head()
results_final.to_parquet('ResultsBERTopic/BERTopic_nodefinedcluster_topics_15n_10com_500cluster_0.54sil_renewout.parquet')
(41601, 10)
In [ ]:
topics_over_time = topic_model.topics_over_time(tc1.corpus, created_on, 
                                                global_tuning=True, evolution_tuning=True, nr_bins=100)
topic_model.visualize_topics_over_time(topics_over_time, width=1250, height=700)

16ClusterTimeSeries500.png

In [107]:
plt.pie(results_final.value_counts('Topic'), labels=results_final.value_counts('Topic').index, autopct='%1.1f%%');
No description has been provided for this image
In [108]:
sns.histplot(results_final, x='Topic', discrete=True);
No description has been provided for this image
In [ ]:
topic_model.save("Models/topic_model_0.50Sil300", serialization='pickle')

400 all-MiniLM-L6-v2¶

In [3]:
df = pd.read_csv('cleaned_data_name_thread.csv')
df = df.dropna(subset=['name_thread'])
df = df.drop_duplicates(subset=['name_thread'], keep='first')
df.shape[0]
Out[3]:
66735
In [ ]:
model = SentenceTransformer('all-MiniLM-L6-v2')
tc1 = ppt.TextClustering(df, 'name_thread')
tc1.encode_corpus(model, batch_size=64, to_tensor=False)
In [ ]:
mmr = MaximalMarginalRelevance(diversity=0.3)
kw = KeyBERTInspired()
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=400, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

topic_model = BERTopic(
    top_n_words=10, 
    n_gram_range=(1, 2),
    umap_model=umap_model, 
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model, 
    ctfidf_model=ctfidf_model, 
    representation_model=[mmr, kw],
    embedding_model=model,
    verbose=True
)

topics, probs = topic_model.fit_transform(tc1.corpus, tc1.corpus_embeddings)
In [23]:
print(topic_model.get_topic_info())
for topic_id in set(topics):
    print(f"Topic {topic_id}:")
    print(topic_model.get_topic(topic_id))
    Topic  Count                                               Name  \
0      -1  30941                         -1_customer_buy_sale_buyer   
1       0   5117                     0_vape_shatter_carts_cartridge   
2       1   2643                      1_login_password_logged_error   
3       2   2579                2_coca_opium_cocain_cocacolacompany   
4       3   2124               3_xanaxlabs_xanaxlife_xanax_xanaxusa   
5       4   1938                     4_postal_usps_delivery_postage   
6       5   1842     5_darkweb_darknetlive_darknetmarkets_sentenced   
7       6   1721      6_empire_empiremarket_empireteam_empiredealer   
8       7   1631                 7_mdma_mdmamaster_pill_ecstasydata   
9       8   1601               8_giftcard_card_giftcards_mastercard   
10      9   1502              9_vendor_vendorpro_vendors_vendorbbmc   
11     10   1417                  10_scamming_scammer_scam_scammers   
12     11   1126      11_counterfeiting_passport_counterfeit_fakeid   
13     12   1072       12_dreammarket_nightmaremarket_market_dreams   
14     13    979                            13_lsd_tab_tabs_shrooms   
15     14    739                      14_monero_coinbase_coin_coins   
16     15    676               15_review_reviewing_reviews_reviewer   
17     16    674           16_pickledrick_heard_theoutfit_muttznutz   
18     17    669            17_market_markets_marketplace_marketing   
19     18    626         18_crosspost_deposting_goingpostal_vendors   
20     19    603              19_deposit_depositing_deposits_ticket   
21     20    573                              20_pgpkey_pgp_pgps_pg   
22     21    535                  21_mod_moderator_dispute_disputes   
23     22    450  22_cryptonia_cryptoniausers_cryptonians_cryptn...   
24     23    445                 23_wsm_wsms_vendorcp_machinerymint   
25     24    443     24_ketamine_ketamin_ketamineking_ketaminekings   
26     25    434             25_ticket_ticketmaster_ticketw_tickets   
27     26    429        26_meth_methbusters_methamphetamine_crystal   

                                       Representation  \
0   [customer, buy, sale, buyer, service, message,...   
1   [vape, shatter, carts, cartridge, ounce, marij...   
2   [login, password, logged, error, problem, log,...   
3   [coca, opium, cocain, cocacolacompany, coke, c...   
4   [xanaxlabs, xanaxlife, xanax, xanaxusa, xanaxr...   
5   [postal, usps, delivery, postage, mail, delive...   
6   [darkweb, darknetlive, darknetmarkets, sentenc...   
7   [empire, empiremarket, empireteam, empiredeale...   
8   [mdma, mdmamaster, pill, ecstasydata, mdmaus, ...   
9   [giftcard, card, giftcards, mastercard, cards,...   
10  [vendor, vendorpro, vendors, vendorbbmc, vendo...   
11  [scamming, scammer, scam, scammers, scammed, s...   
12  [counterfeiting, passport, counterfeit, fakeid...   
13  [dreammarket, nightmaremarket, market, dreams,...   
14  [lsd, tab, tabs, shrooms, acid, blotter, blott...   
15  [monero, coinbase, coin, coins, cryptocurrency...   
16  [review, reviewing, reviews, reviewer, reviewe...   
17  [pickledrick, heard, theoutfit, muttznutz, hou...   
18  [market, markets, marketplace, marketing, nonm...   
19  [crosspost, deposting, goingpostal, vendors, c...   
20  [deposit, depositing, deposits, ticket, deposi...   
21  [pgpkey, pgp, pgps, pg, pgc, gnupg, key, gpg, ...   
22  [mod, moderator, dispute, disputes, disputers,...   
23  [cryptonia, cryptoniausers, cryptonians, crypt...   
24  [wsm, wsms, vendorcp, machinerymint, wowza, pa...   
25  [ketamine, ketamin, ketamineking, ketamineking...   
26  [ticket, ticketmaster, ticketw, tickets, suppo...   
27  [meth, methbusters, methamphetamine, crystal, ...   

                                  Representative_Docs  
0   [dutchdrugz updates promo active till market p...  
1   [sale girl scout cookie carts strains oz lb us...  
2   [hey really could use help advice thanks, erro...  
3   [colombian coke brazil ship world wide promoti...  
4   [adderall mg ir adderall mg xanax super sale, ...  
5   [informed delivery showing package, usa canada...  
6   [three student arrested dark web drug traffick...  
7   [empire anyone else, empire market back, empir...  
8   [sale xtc pill mg mda us ca, uk mdma pill vend...  
9   [carding amazon gift card, gift card prepaid d...  
10  [nmm giving vendor runaround lying acting shad...  
11  [market exit scam next, scam alert ukdrugdeale...  
12  [buy counterfeit money real fake document, buy...  
13  [dream market still, dream market, eleven drea...  
14  [lsd blotter tab ug top quality, point one fre...  
15  [looking best safe way buy large amount bitcoi...  
16  [needing send sample bar trusted reviewer woul...  
17  [anybody heard theoutfit, anybody heard pickle...  
18  [market anyone else, market, currently working...  
19  [envoy want crosspost, could vendor crosspost,...  
20  [missing deposit double deposit please help, a...  
21     [pgp public key, market pgp key, find pgp key]  
22  [moderator dispute day, moderator please help ...  
23  [cryptonia market, market king samsara crypton...  
24                   [wsm vendor, wsm back, wsm down]  
25       [ketamine us, get ketamine, ketamine anyone]  
26  [help support ticket please, help support tick...  
27  [crystal meth uk, crystal meth, crystal meth v...  
Topic 0:
[('vape', 0.4513024), ('shatter', 0.4508166), ('carts', 0.42475972), ('cartridge', 0.4150574), ('ounce', 0.38511506), ('marijuana', 0.3761327), ('cannabis', 0.37473193), ('edibles', 0.36946523), ('weed', 0.35874215), ('cart', 0.3494926)]
Topic 1:
[('login', 0.6874596), ('password', 0.58739483), ('logged', 0.44535103), ('error', 0.39473626), ('problem', 0.38404456), ('log', 0.3703017), ('account', 0.36962464), ('help', 0.36578366), ('trouble', 0.3579351), ('session', 0.34920555)]
Topic 2:
[('coca', 0.5442445), ('opium', 0.5241908), ('cocain', 0.48566723), ('cocacolacompany', 0.47682497), ('coke', 0.4701375), ('cocainehcl', 0.4403491), ('cocaine', 0.43470532), ('heroinfactory', 0.43406424), ('colombian', 0.40406665), ('cokemaster', 0.39702898)]
Topic 3:
[('xanaxlabs', 0.68098766), ('xanaxlife', 0.6694618), ('xanax', 0.64481914), ('xanaxusa', 0.5943617), ('xanaxring', 0.5927005), ('xanaxdepot', 0.5860753), ('xanaxdaddy', 0.57530177), ('xanaxblotters', 0.5676911), ('alprazolam', 0.5388765), ('xanaxinc', 0.5038374)]
Topic 4:
[('postal', 0.5783647), ('usps', 0.5671008), ('delivery', 0.552514), ('postage', 0.5435632), ('mail', 0.4794371), ('deliver', 0.46840727), ('package', 0.4595977), ('shipment', 0.4503156), ('shipping', 0.44325382), ('fedex', 0.44258836)]
Topic 5:
[('darkweb', 0.5460649), ('darknetlive', 0.47999817), ('darknetmarkets', 0.46108282), ('sentenced', 0.4581046), ('darknetmarketsnoobs', 0.4534067), ('darknet', 0.45285586), ('darkbay', 0.45059866), ('darkfail', 0.44140962), ('darkdotfail', 0.42702472), ('darknetaustralia', 0.42165762)]
Topic 6:
[('empire', 0.8657665), ('empiremarket', 0.8325376), ('empireteam', 0.7658358), ('empiredealer', 0.73584473), ('empires', 0.7089321), ('imperial', 0.59743464), ('imperialroyalty', 0.533589), ('market', 0.39446667), ('scammer', 0.3011508), ('nightmare', 0.29797795)]
Topic 7:
[('mdma', 0.57491755), ('mdmamaster', 0.55362886), ('pill', 0.54554516), ('ecstasydata', 0.54158187), ('mdmaus', 0.536477), ('mdacanada', 0.49906433), ('mda', 0.47733676), ('md', 0.47456974), ('ecstasy', 0.46981525), ('mg', 0.45221412)]
Topic 8:
[('giftcard', 0.68464833), ('card', 0.6067195), ('giftcards', 0.60337466), ('mastercard', 0.5686253), ('cards', 0.5325688), ('carding', 0.5214343), ('debit', 0.500812), ('carded', 0.49536285), ('carder', 0.48081687), ('cardable', 0.45047107)]
Topic 9:
[('vendor', 0.6717965), ('vendorpro', 0.64170885), ('vendors', 0.63945156), ('vendorbbmc', 0.6131782), ('vendorshop', 0.5619679), ('supplier', 0.4961744), ('shop', 0.43687624), ('inventory', 0.38063982), ('dealer', 0.37658587), ('trusted', 0.35675985)]
Topic 10:
[('scamming', 0.67339057), ('scammer', 0.64245546), ('scam', 0.6315777), ('scammers', 0.60618246), ('scammed', 0.5859374), ('scams', 0.5844768), ('exit', 0.38286078), ('ukdrugdealer', 0.37872887), ('warning', 0.35860184), ('confirmed', 0.3483911)]
Topic 11:
[('counterfeiting', 0.5351553), ('passport', 0.49532643), ('counterfeit', 0.48550797), ('fakeid', 0.46835682), ('forgery', 0.46821818), ('passports', 0.46553856), ('certificate', 0.46403533), ('fakeids', 0.36332572), ('licenses', 0.3491515), ('citizenship', 0.33687454)]
Topic 12:
[('dreammarket', 0.840524), ('nightmaremarket', 0.7301478), ('market', 0.679103), ('dreams', 0.5537206), ('nightmare', 0.54951864), ('dream', 0.52395815), ('dreaming', 0.51259714), ('nightmares', 0.5112673), ('dreamweaver', 0.4622426), ('deals', 0.4392535)]
Topic 13:
[('lsd', 0.6597349), ('tab', 0.4486916), ('tabs', 0.42244914), ('shrooms', 0.40983063), ('acid', 0.37709463), ('blotter', 0.3619333), ('blotters', 0.34030285), ('microdose', 0.31792137), ('dmt', 0.30784056), ('samspade', 0.306018)]
Topic 14:
[('monero', 0.66440576), ('coinbase', 0.6017641), ('coin', 0.58206344), ('coins', 0.55229485), ('cryptocurrency', 0.54781383), ('crypto', 0.5190888), ('bitcoin', 0.49815544), ('btc', 0.4951193), ('cryptocurrencies', 0.49073264), ('bitcoins', 0.48276216)]
Topic 15:
[('review', 0.7554549), ('reviewing', 0.70764035), ('reviews', 0.67082256), ('reviewer', 0.6707778), ('reviewed', 0.66799235), ('vendor', 0.3507808), ('post', 0.3232708), ('sample', 0.3039448), ('journal', 0.28708428), ('dankservices', 0.2783244)]
Topic 16:
[('pickledrick', 0.49188858), ('heard', 0.45528996), ('theoutfit', 0.4499943), ('muttznutz', 0.40856874), ('houseofdank', 0.38270152), ('purepharm', 0.3821613), ('thecandymanuk', 0.38004813), ('ndduk', 0.3797817), ('uzak', 0.37892848), ('turk', 0.37287065)]
Topic 17:
[('market', 0.9246511), ('markets', 0.82856095), ('marketplace', 0.66924006), ('marketing', 0.64059925), ('nonmarket', 0.63226146), ('undermarket', 0.5758176), ('traderoute', 0.5252505), ('farmersmarket', 0.51230544), ('demand', 0.48939776), ('trade', 0.4373095)]
Topic 18:
[('crosspost', 0.8023433), ('deposting', 0.54462177), ('goingpostal', 0.4369921), ('vendors', 0.3397432), ('courier', 0.31433263), ('tarred', 0.30136013), ('expose', 0.28236645), ('shop', 0.26232204), ('buyers', 0.25981808), ('weareamsterdam', 0.25617945)]
Topic 19:
[('deposit', 0.5940467), ('depositing', 0.54835135), ('deposits', 0.4703769), ('ticket', 0.4124618), ('deposited', 0.37039375), ('transaction', 0.32960162), ('btc', 0.29055083), ('fund', 0.28815228), ('unconfirmed', 0.28022093), ('twice', 0.27061075)]
Topic 20:
[('pgpkey', 0.78953433), ('pgp', 0.64266664), ('pgps', 0.60433674), ('pg', 0.57204497), ('pgc', 0.5202303), ('gnupg', 0.49523085), ('key', 0.4912796), ('gpg', 0.45877883), ('keys', 0.42667422), ('pgplogin', 0.40541986)]
Topic 21:
[('mod', 0.6461178), ('moderator', 0.6455801), ('dispute', 0.63188905), ('disputes', 0.53940743), ('disputers', 0.5393207), ('mods', 0.5271941), ('complaint', 0.47743487), ('modderator', 0.43813834), ('consensus', 0.3737623), ('handled', 0.37211758)]
Topic 22:
[('cryptonia', 0.82683897), ('cryptoniausers', 0.7519192), ('cryptonians', 0.7422215), ('cryptnonia', 0.6530852), ('cryptoni', 0.6209998), ('cryptoice', 0.5572725), ('market', 0.5073216), ('samasara', 0.42220467), ('samsera', 0.42188087), ('samsara', 0.3912958)]
Topic 23:
[('wsm', 0.8689953), ('wsms', 0.6338644), ('vendorcp', 0.41763154), ('machinerymint', 0.36969972), ('wowza', 0.36484522), ('paymwn', 0.32914096), ('maintenance', 0.31149185), ('greennz', 0.3085622), ('bionik', 0.30364022), ('bioniks', 0.30257553)]
Topic 24:
[('ketamine', 0.9532861), ('ketamin', 0.86957943), ('ketamineking', 0.8578399), ('ketaminekings', 0.8378519), ('ketaminehouse', 0.8028732), ('ketamax', 0.69982356), ('ketaconnect', 0.527894), ('tiletamine', 0.5001087), ('pyrimethamine', 0.48265585), ('pharmaceutical', 0.43739906)]
Topic 25:
[('ticket', 0.7282917), ('ticketmaster', 0.6860643), ('ticketw', 0.65911514), ('tickets', 0.62922376), ('support', 0.51385075), ('concert', 0.37351736), ('help', 0.29014573), ('assist', 0.28098187), ('fix', 0.27553594), ('outstanding', 0.27276954)]
Topic 26:
[('meth', 0.7546984), ('methbusters', 0.71206135), ('methamphetamine', 0.6617794), ('crystal', 0.6237694), ('methamph', 0.6163767), ('methoxetamine', 0.6146395), ('methadone', 0.58694017), ('dmethamphetamine', 0.5264992), ('methaqualone', 0.49982086), ('amphetamine', 0.49571955)]
Topic -1:
[('customer', 0.44219303), ('buy', 0.42263174), ('sale', 0.38992852), ('buyer', 0.38299185), ('service', 0.38183293), ('message', 0.37282392), ('update', 0.37055105), ('price', 0.37036857), ('paypal', 0.35097662), ('legit', 0.34381357)]
In [24]:
topic_model.get_topic_info()
Out[24]:
Topic Count Name Representation Representative_Docs
0 -1 30941 -1_customer_buy_sale_buyer [customer, buy, sale, buyer, service, message,... [dutchdrugz updates promo active till market p...
1 0 5117 0_vape_shatter_carts_cartridge [vape, shatter, carts, cartridge, ounce, marij... [sale girl scout cookie carts strains oz lb us...
2 1 2643 1_login_password_logged_error [login, password, logged, error, problem, log,... [hey really could use help advice thanks, erro...
3 2 2579 2_coca_opium_cocain_cocacolacompany [coca, opium, cocain, cocacolacompany, coke, c... [colombian coke brazil ship world wide promoti...
4 3 2124 3_xanaxlabs_xanaxlife_xanax_xanaxusa [xanaxlabs, xanaxlife, xanax, xanaxusa, xanaxr... [adderall mg ir adderall mg xanax super sale, ...
5 4 1938 4_postal_usps_delivery_postage [postal, usps, delivery, postage, mail, delive... [informed delivery showing package, usa canada...
6 5 1842 5_darkweb_darknetlive_darknetmarkets_sentenced [darkweb, darknetlive, darknetmarkets, sentenc... [three student arrested dark web drug traffick...
7 6 1721 6_empire_empiremarket_empireteam_empiredealer [empire, empiremarket, empireteam, empiredeale... [empire anyone else, empire market back, empir...
8 7 1631 7_mdma_mdmamaster_pill_ecstasydata [mdma, mdmamaster, pill, ecstasydata, mdmaus, ... [sale xtc pill mg mda us ca, uk mdma pill vend...
9 8 1601 8_giftcard_card_giftcards_mastercard [giftcard, card, giftcards, mastercard, cards,... [carding amazon gift card, gift card prepaid d...
10 9 1502 9_vendor_vendorpro_vendors_vendorbbmc [vendor, vendorpro, vendors, vendorbbmc, vendo... [nmm giving vendor runaround lying acting shad...
11 10 1417 10_scamming_scammer_scam_scammers [scamming, scammer, scam, scammers, scammed, s... [market exit scam next, scam alert ukdrugdeale...
12 11 1126 11_counterfeiting_passport_counterfeit_fakeid [counterfeiting, passport, counterfeit, fakeid... [buy counterfeit money real fake document, buy...
13 12 1072 12_dreammarket_nightmaremarket_market_dreams [dreammarket, nightmaremarket, market, dreams,... [dream market still, dream market, eleven drea...
14 13 979 13_lsd_tab_tabs_shrooms [lsd, tab, tabs, shrooms, acid, blotter, blott... [lsd blotter tab ug top quality, point one fre...
15 14 739 14_monero_coinbase_coin_coins [monero, coinbase, coin, coins, cryptocurrency... [looking best safe way buy large amount bitcoi...
16 15 676 15_review_reviewing_reviews_reviewer [review, reviewing, reviews, reviewer, reviewe... [needing send sample bar trusted reviewer woul...
17 16 674 16_pickledrick_heard_theoutfit_muttznutz [pickledrick, heard, theoutfit, muttznutz, hou... [anybody heard theoutfit, anybody heard pickle...
18 17 669 17_market_markets_marketplace_marketing [market, markets, marketplace, marketing, nonm... [market anyone else, market, currently working...
19 18 626 18_crosspost_deposting_goingpostal_vendors [crosspost, deposting, goingpostal, vendors, c... [envoy want crosspost, could vendor crosspost,...
20 19 603 19_deposit_depositing_deposits_ticket [deposit, depositing, deposits, ticket, deposi... [missing deposit double deposit please help, a...
21 20 573 20_pgpkey_pgp_pgps_pg [pgpkey, pgp, pgps, pg, pgc, gnupg, key, gpg, ... [pgp public key, market pgp key, find pgp key]
22 21 535 21_mod_moderator_dispute_disputes [mod, moderator, dispute, disputes, disputers,... [moderator dispute day, moderator please help ...
23 22 450 22_cryptonia_cryptoniausers_cryptonians_cryptn... [cryptonia, cryptoniausers, cryptonians, crypt... [cryptonia market, market king samsara crypton...
24 23 445 23_wsm_wsms_vendorcp_machinerymint [wsm, wsms, vendorcp, machinerymint, wowza, pa... [wsm vendor, wsm back, wsm down]
25 24 443 24_ketamine_ketamin_ketamineking_ketaminekings [ketamine, ketamin, ketamineking, ketamineking... [ketamine us, get ketamine, ketamine anyone]
26 25 434 25_ticket_ticketmaster_ticketw_tickets [ticket, ticketmaster, ticketw, tickets, suppo... [help support ticket please, help support tick...
27 26 429 26_meth_methbusters_methamphetamine_crystal [meth, methbusters, methamphetamine, crystal, ... [crystal meth uk, crystal meth, crystal meth v...
In [25]:
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.6434006690979004
Davies_bouldin_score: 0.4681034572960446
In [26]:
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 5))
topic_model.update_topics(tc1.corpus, vectorizer_model=vectorizer_model)
In [27]:
topic_model.visualize_topics()

27DistanceTimeSeries_0.65_400.png

In [28]:
topic_model.visualize_heatmap()

image.png

In [29]:
topic_model.visualize_hierarchy()

image.png

In [30]:
reduced_embeddings = UMAP(n_neighbors=15, n_components=2, 
                          min_dist=0.0, metric='cosine').fit_transform(tc1.corpus_embeddings)
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings, 
                                hide_document_hover=True, hide_annotations=True)

image.png

In [121]:
new_topics = topic_model.reduce_outliers(tc1.corpus, topics, strategy="embeddings", embeddings=tc1.corpus_embeddings, threshold=0.5)
topic_model.update_topics(tc1.corpus, topics=new_topics)
topic_model.get_topic_info()
2024-06-27 14:34:02,549 - BERTopic - WARNING: Using a custom list of topic assignments may lead to errors if topic reduction techniques are used afterwards. Make sure that manually assigning topics is the last step in the pipeline.Note that topic embeddings will also be created through weightedc-TF-IDF embeddings instead of centroid embeddings.
Out[121]:
Topic Count Name Representation Representative_Docs
0 -1 27323 -1_anyone_vendor_order_review [anyone, vendor, order, review, new, get, acco... [dutchdrugz updates promo active till market p...
1 0 5137 0_weed_cannabis_cart_review [weed, cannabis, cart, review, thc, vendor, oz... [sale girl scout cookie carts strains oz lb us...
2 1 2700 1_help_login_need_account [help, login, need, account, sub, back, passwo... [hey really could use help advice thanks, erro...
3 2 2601 2_cocaine_coke_heroin_drug [cocaine, coke, heroin, drug, vendor, uk, best... [colombian coke brazil ship world wide promoti...
4 3 2270 3_xanax_mg_adderall_alprazolam [xanax, mg, adderall, alprazolam, bar, diazepa... [adderall mg ir adderall mg xanax super sale, ...
5 4 2031 4_order_shipping_package_delivery [order, shipping, package, delivery, shipped, ... [informed delivery showing package, usa canada...
6 5 1861 5_darknet_dark_tor_web [darknet, dark, tor, web, onion, dark web, dar... [three student arrested dark web drug traffick...
7 6 1826 6_empire_empire market_empire empire_market [empire, empire market, empire empire, market,... [empire anyone else, empire market back, empir...
8 7 1653 7_mdma_pill_mda_xtc [mdma, pill, mda, xtc, mdma vendor, mg, usa, p... [sale xtc pill mg mda us ca, uk mdma pill vend...
9 8 1628 8_card_carding_cc_credit [card, carding, cc, credit, cvv, credit card, ... [carding amazon gift card, gift card prepaid d...
10 9 3010 9_vendor_vendor vendor_inquiry_vendor inquiry [vendor, vendor vendor, inquiry, vendor inquir... [nmm giving vendor runaround lying acting shad...
11 10 1741 10_scam_scammer_exit_scamming [scam, scammer, exit, scamming, scammed, exit ... [market exit scam next, scam alert ukdrugdeale...
12 11 1147 11_counterfeit_id_fake_passport [counterfeit, id, fake, passport, fake id, not... [buy counterfeit money real fake document, buy...
13 12 1202 12_dream_nightmare_dream market_market [dream, nightmare, dream market, market, night... [dream market still, dream market, eleven drea...
14 13 1009 13_lsd_ug_tab_lsd vendor [lsd, ug, tab, lsd vendor, acid, free, lsd tab... [lsd blotter tab ug top quality, point one fre...
15 14 854 14_monero_btc_bitcoin_coin [monero, btc, bitcoin, coin, crypto, wallet, b... [looking best safe way buy large amount bitcoi...
16 15 926 15_review_vendor review_vendor_review vendor [review, vendor review, vendor, review vendor,... [needing send sample bar trusted reviewer woul...
17 16 681 16_heard_anyone_anyone heard_happened [heard, anyone, anyone heard, happened, has, h... [anybody heard theoutfit, anybody heard pickle...
18 17 989 17_market_market market_new market_new [market, market market, new market, new, apoll... [market anyone else, market, currently working...
19 18 764 18_crosspost_review crosspost_crosspost vendor... [crosspost, review crosspost, crosspost vendor... [envoy want crosspost, could vendor crosspost,...
20 19 671 19_deposit_deposited_ticket_address [deposit, deposited, ticket, address, double, ... [missing deposit double deposit please help, a...
21 20 596 20_pgp_key_pgp key_public [pgp, key, pgp key, public, public pgp, messag... [pgp public key, market pgp key, find pgp key]
22 21 551 21_dispute_dispute dispute_mod_moderator [dispute, dispute dispute, mod, moderator, ple... [moderator dispute day, moderator please help ...
23 22 480 22_cryptonia_samsara_samsara market_cryptonia ... [cryptonia, samsara, samsara market, cryptonia... [cryptonia market, market king samsara crypton...
24 23 485 23_wsm_wsm wsm_wsm vendor_vendor wsm [wsm, wsm wsm, wsm vendor, vendor wsm, vendor,... [wsm vendor, wsm back, wsm down]
25 24 468 24_ketamine_ketamine vendor_mdma ketamine_keta... [ketamine, ketamine vendor, mdma ketamine, ket... [ketamine us, get ketamine, ketamine anyone]
26 25 458 25_ticket_support ticket_support_please [ticket, support ticket, support, please, mont... [help support ticket please, help support tick...
27 26 467 26_meth_crystal meth_crystal_meth vendor [meth, crystal meth, crystal, meth vendor, met... [crystal meth uk, crystal meth, crystal meth v...
In [153]:
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
In [ ]:
zero_shot_topics = pd.read_csv('../../../intent_crime.csv')['intent'].tolist()
dict_zero_shots_25 = ppt.assign_labels_to_topics(classifier, topic_model, zero_shot_topics, len(set(new_topics) - {-1}), threshold=.25)
dict_zero_shots_2 = ppt.assign_labels_to_topics(classifier, topic_model, zero_shot_topics, len(set(new_topics) - {-1}), threshold=.2)
dict_zero_shots_17 = ppt.assign_labels_to_topics(classifier, topic_model, zero_shot_topics, len(set(new_topics) - {-1}), threshold=.17)
dict_zero_shots_15 = ppt.assign_labels_to_topics(classifier, topic_model, zero_shot_topics, len(set(new_topics) - {-1}), threshold=.15)
In [390]:
pd.DataFrame(list(dict_zero_shots_25.items()), columns=['Topic', 'Labels']).to_csv('ZeroShotClassificationResults/all-MiniLM-L6-v2_400/zero_shot_025.csv', index=False)
pd.DataFrame(list(dict_zero_shots_2.items()), columns=['Topic', 'Labels']).to_csv('ZeroShotClassificationResults/all-MiniLM-L6-v2_400/zero_shot_020.csv', index=False)
pd.DataFrame(list(dict_zero_shots_17.items()), columns=['Topic', 'Labels']).to_csv('ZeroShotClassificationResults/all-MiniLM-L6-v2_400/zero_shot_017.csv', index=False)
pd.DataFrame(list(dict_zero_shots_15.items()), columns=['Topic', 'Labels']).to_csv('ZeroShotClassificationResults/all-MiniLM-L6-v2_400/zero_shot_015.csv', index=False)
In [392]:
dict_zero_shots_2[18] = 'crosspost vendor'
dict_zero_shots_2[22] = 'samsara market'
dict_zero_shots_2[23] = 'wsm market'
In [395]:
topic_model.set_topic_labels(dict_zero_shots_2)
In [396]:
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings, 
                                hide_document_hover=True, hide_annotations=True, custom_labels=True)

image.png

In [397]:
topic_model.visualize_hierarchy(custom_labels=True)

image.png

In [44]:
topic_model.visualize_topics()

image.png

In [400]:
topic_model.visualize_barchart(top_n_topics=25, custom_labels=True, n_words=10)

image.png

In [125]:
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(new_topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.5175204277038574
Davies_bouldin_score: 0.7919422601150089
In [ ]:
df['name_thread'] = df['name_thread'].str.lower().dropna()
df.drop_duplicates(subset='name_thread', inplace=True)
df.dropna(subset=['name_thread'], inplace=True)
created_on = df['created_on'].tolist()
len(created_on)
In [406]:
topics_over_time = topic_model.topics_over_time(tc1.corpus, created_on, 
                                                global_tuning=True, evolution_tuning=True, nr_bins=100)
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10, width=1250, height=700, custom_labels=True)
15it [00:24,  1.62s/it]

image.png

In [403]:
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
corpus_valid = [tc1.corpus[i] for i in indices]
created_on_valid = [created_on[i] for i in indices]
embeddings_valid = [tc1.corpus_embeddings[i] for i in indices]
topics_valid = [new_topics[i] for i in indices]
probs_valid = [probs[i] for i in indices]

results = pd.DataFrame({
    'Document': corpus_valid,
    'Embedding': embeddings_valid,
    'Topic': topics_valid,
    'Probability': probs_valid,
    'Created_on': created_on_valid,
})
results_final = pd.merge(results, topic_model.get_topic_info(), on='Topic')
results_final['UMAP_embedding'] = list(X)
print(results_final.shape)
results_final.head()
(38274, 11)
Out[403]:
Document Embedding Topic Probability Created_on Count Name CustomName Representation Representative_Docs UMAP_embedding
0 review empire vendor acidbern [-0.07762138, -0.049061198, -0.046745114, -0.0... 6 0.527385 2020-01-09 1826 6_empire_empire market_empire empire_market empire market [empire, empire market, empire empire, market,... [empire anyone else, empire market back, empir... [9.086779, 3.6718397, 8.9006195, -1.1745992, 1...
1 vendor shipping combine priority [-0.027722627, -0.0031221025, 0.01195772, -0.0... 4 0.962274 2019-11-06 2031 4_order_shipping_package_delivery order [order, shipping, package, delivery, shipped, ... [informed delivery showing package, usa canada... [9.679236, 2.7164314, 8.733615, 0.011899776, 8...
2 open ticket since may ticket [0.055031013, -0.018210536, -0.0026789573, -0.... 25 1.000000 2020-01-09 458 25_ticket_support ticket_support_please ticket support - ask help [ticket, support ticket, support, please, mont... [help support ticket please, help support tick... [9.901975, 5.2703958, 11.463735, 0.47217792, 8...
3 vendor inquiry destroid dream [-0.023196185, 0.0573189, 0.028408512, -0.0222... 9 0.000000 2019-11-06 3010 9_vendor_vendor vendor_inquiry_vendor inquiry inquiry - vendor vendor - vendor [vendor, vendor vendor, inquiry, vendor inquir... [nmm giving vendor runaround lying acting shad... [9.912251, 4.028657, 7.623224, -0.7158077, 9.2...
4 morrison saver stamps uk money maker easiest m... [-0.020903945, 0.050762244, -0.041445963, 0.01... 11 0.799023 2020-01-09 1147 11_counterfeit_id_fake_passport counterfeit money - fake IDs [counterfeit, id, fake, passport, fake id, not... [buy counterfeit money real fake document, buy... [9.859931, 3.1459394, 9.145497, -1.0489817, 9....
In [ ]:
topic_model.save("Models/topic_model_all-MiniLM-L6-v2_400", serialization="pickle", save_ctfidf=True, save_embedding_model=model)
In [405]:
results_final.to_parquet('ResultsBERTopic/BERTopic_all-MiniLM-L6-v2_400.parquet')
In [ ]:
import nbconvert

!jupyter nbconvert --to html show_results.ipynb